##### @Mike
# Can you add a few sentences about how you got the data (i.e. the struggles you went through)
# Import necessary modules
import os
import glob
import pandas as pd
import statsmodels.formula.api as smf
import plotly.graph_objects as go # for interactive plot with human defined labels
import plotly.express as px
# Load necessary data
path = "/Users/ZXue/src/speeding-up-sci-correlation/"
df = pd.read_table(os.path.join(path, "TARA_025-merged-only-those-in-both-TPM.tsv"))
df.head() # a look at the sample
## Add KO ID to df
df_anno = pd.read_table(os.path.join(path, "Our-Kegg-annotations.tsv"))
df_anno.head()
df = df.merge(df_anno, on='Gene')
# remove genes that are not annotated with a KO ID
df.dropna()
## Add pathway info to df
df_path = pd.read_table(os.path.join(path, "Kegg-Orthology-table.tsv"))
df_path.head()
df = df.merge(df_path, on='KO_ID')
df.head()
# Make the figure with scatter dots
tracedot = go.Scatter(x=df['DNA'], # User should change this based on dataframe columns
y=df['RNA'], # User defined
mode='markers', # for scatter plot
text=df['KO_ID'], # hover text
#color=df['Category1'],
name='Data') # User defined
# Initialise and fit linear regression model using `statsmodels`
### tutorial https://towardsdatascience.com/introduction-to-linear-regression-in-python-c12a072bedf0
model = smf.ols('RNA ~ DNA', data=df)
model = model.fit()
# get linear regression parameters
model.params
# get linear regression R^2
model.rsquared
# get linear regression p-value
model.pvalues
# linear regression equation
# line= 0.495884*xi+11.028804
max_val = df['DNA'].max()
df2 = [[0, 11.028804],[max_val, 0.495884*max_val+11.028804]]
df2 = pd.DataFrame(df2, columns = ['xi', 'y'])
# Make the line graph
trace_ln = go.Scatter(
x=df2['xi'],
y=df2['y'],
mode='lines',
name='Fit'
)
annotation = go.layout.Annotation(
x=500,
y=15000,
text='$R^2 = 0.025,\\y = 0.495884x + 11.028804$',
showarrow=False,
font=go.Font(size=16)
)
layout = go.Layout(annotations=[annotation])
# Tie everything together to a figure
fig = go.Figure([tracedot,trace_ln], layout=layout)
# Edit the figure layout
fig.update_layout(title='Gene correlations between metagenomics and metatranscriptomic resutls',
xaxis_title='Count (Metagenomics)',
yaxis_title='Count (Metatranscriptomics)')
fig.show()
# go trace plotting can not set color to dots by category
## so I am now trying to set color with px plotting
## and add line and text through adding trace
figdot = px.scatter(df, x="DNA", y="RNA", color="Category1", hover_data=['KO_ID'])
# a "dummy" scatter plot trace that is used to carry the text
trace_txt = go.Scatter(
x=[700],
y=[15000],
mode='markers+text',
text=['$R^2 = 0.025,\\y = 0.495884x + 11.028804$'],
marker=dict(size = 1),
name='linear regression model',
textposition='bottom center'
)
figdot.add_trace(trace_ln).add_trace(trace_txt)
#go.Figure(figdot, layout=layout) # why is the text in layout settings not showing up?!
figdot.show()
'''An example of interesting results: K01574 was expressed at very high level in metatrancriptomic results but its "copy number"
is relatively low by metagenomic measurements. '''
'''This KO ID is linked to both carbohydrate and lipid metabolism, suggesting robust microbial activities
within in the sampled Tara oceam specimen.'''
df[df['KO_ID'] == 'K01574']
## To make a web-based application using plotly?